﻿import nltk
from nltk.corpus import brown, reuters, stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# List the corpora NLTK provides loaders for.
# nltk.corpus has no corpus_list() function (calling it raises
# AttributeError). Corpora are exposed as attributes of the nltk.corpus
# package: LazyCorpusLoader placeholders before first use, CorpusReader
# instances once loaded. Collecting attributes of either type gives the list.
# NOTE: a name appearing here means a loader is defined for it, not that its
# data has been downloaded; use nltk.download('<name>') to fetch missing data.
_corpus_types = (
    nltk.corpus.util.LazyCorpusLoader,
    nltk.corpus.reader.CorpusReader,
)
available_corpora = sorted(
    name
    for name, obj in vars(nltk.corpus).items()
    if isinstance(obj, _corpus_types)
)
print("可用语料库:", available_corpora)

# Inspect the Brown corpus: its categories and the first few file ids.
print("布朗语料库类别:", brown.categories())
print("布朗语料库文件:", brown.fileids()[:5])

# Read the words of the 'news' category from the Brown corpus.
news_text = brown.words(categories='news')
print("新闻类别前20个词:", news_text[:20])

# Count token frequencies. Tokens are counted exactly as they appear in the
# corpus (no lowercasing or filtering is applied here).
fdist = nltk.FreqDist(news_text)
print("最常见的10个词:", fdist.most_common(10))

# Load the English stop-word list as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Sets are unordered, so sort before sampling to keep the printed example
# identical from run to run.
print("英文停用词示例:", sorted(stop_words)[:10])